Enter 2006 September

home *** CD-ROM | disk | FTP | other *** search

/ Enter 2006 September / Enter 09 2006.iso / Internet / SpamExperts Home 1.1 / SpamExperts Home.exe / lib / spamexperts.modules / spamexperts / fingerprint / fingerprint.pyc (.txt) < prev

Wrap

Python Compiled Bytecode | 2006-07-14 | 5.8 KB | 168 lines

# Source Generated with Decompyle++ # File: in.pyc (Python 2.4) '''E-mail fingerprint extraction library Client interface to fingerprint database needed for centralized ham/spam recognition. ''' from __future__ import division import re import sys import types try: set except NameError: from sets import Set as set from spambayes import tokenizer from spamexperts.Options import options class Fingerprint(object): """This class is a factory for generating fingerprints from messages. As much as possible, text that has been added to a message to prevent fingerprint-type schemes (random text, whitespace, HTML) is removed before fingerprinting. To use, call Fingerprint.get_fingerprint() to get the fingerprint (to pass to the server as a query), and then Fingerprint.spamprob() with the query results to get a probability that the messages is spam. NOTE: fingerprint generation method is described in: Feng Zhou, Li Zhuang, Ben Y. Zhao, Ling Huang, Anthony D. Joseph, and John Kubiatowics. 'Approximate Object Location and Spam Filtering on Peer-to-Peer Systems'. Appears in Proc. of ACM/IFIP/USENIX Intl. Middleware Conf. (Middleware 2003). """ def get_ridges(self, data, normalise): if normalise: data = self.normalise(data) L = options[('fingerprint', 'l')] data_length = len(data) if data_length < L: return [ self._hash(data)] return [ self._hash(data[i:i + L]) for i in xrange(0, data_length, L) ] def _hash(chunk): '''Generate substrings, and hash them (converting to strings, and reversing). ''' return str(abs(hash(chunk)))[::-1] _hash = staticmethod(_hash) def check_ridges(self, data, normalise, ridges): L = options[('fingerprint', 'l')] if normalise: data = data.get_payload() else: return [] return [ (orig, self._hash(norm) in ridges) for orig, norm in self.split_and_norm(data, L) ] def split_and_norm(cls, data, required_length): orig_chunk = [] norm_chunk = [] in_a_row = False for c in data: orig_chunk.append(c) norm = cls.replace_re.sub(' ', c) if norm != ' ' or not in_a_row: norm_chunk.append(norm) if len(norm_chunk) == required_length: yield (''.join(orig_chunk), ''.join(norm_chunk)) orig_chunk = [] norm_chunk = [] if norm != ' ': in_a_row = False else: in_a_row = True norm != ' ' yield (''.join(orig_chunk), ''.join(norm_chunk)) split_and_norm = classmethod(split_and_norm) replace_re = re.compile('\\s+') tokenizer = tokenizer.Tokenizer() def normalise(self, msg): if options[('fingerprint', 'use_tokenizer')]: return ' '.join(self.tokenizer.tokenize_body(msg)) return self.replace_re.sub(' ', msg.get_payload()) def get_body(self, msg): '''To get the body, we walk through the message, collecting all parts (not just text, although only text parts are normalised). ''' yield (msg.get('Subject', ''), False) for part in msg.walk(): if part.get_content_maintype() == 'text': yield (part, True) continue payload = part.get_payload() if not isinstance(payload, types.ListType): yield (payload, False) continue def get_fingerprint(self, msg): '''Generation of a fingerprint from an email message. The L parameter defines the length of collected the substrings. The N parameter defines the number of selected substrings. ''' hash_substrings = [] for section, normalise in self.get_body(msg): hash_substrings.extend(self.get_ridges(section, normalise)) hash_substrings.sort() hash_substrings = hash_substrings[:options[('fingerprint', 'max-n')]] cfv = set(hash_substrings) if '' in cfv: cfv.remove('') if len(cfv) < options[('fingerprint', 'min-n')]: return set() return cfv def notate_fingerprint(self, msg, ridges): '''Returns a list of (data, matched). The idea is to then draw the message and change the colour if matched is True.''' notated = [] for section, normalise in self.get_body(msg): notated.extend(self.check_ridges(section, normalise, ridges)) return notated def spamprob(self, max_match_count, mail_fingerprints): ''' Returns the spam probability for given message to be spam, given the number of fingerprints that were matched. ''' total_count = len(mail_fingerprints) matching_count = int(max_match_count) if options[('globals', 'verbose')]: print >>sys.stderr, 'Fingerprint: Found %s matching hashes of %s total hashes' % (matching_count, total_count) if total_count != 0: f_prob = 0.5 + matching_count / total_count / 2 else: f_prob = 0.5 return f_prob